Setting up the Environment

In [6]:
%%capture
!pip install spacy
!pip install gensim
!pip install pyLDAvis
!python -m spacy download en_core_web_sm
In [ ]:
import re
import spacy
import pickle
import gensim
import logging
import warnings
import numpy as np
import pandas as pd
import gensim.corpora as corpora

from gensim.models import LdaSeqModel
from gensim.corpora import Dictionary
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt

from pprint import pprint
from nltk.corpus import stopwords
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from nltk.corpus import PlaintextCorpusReader
from gensim.parsing.preprocessing import preprocess_string


%matplotlib inline
pyLDAvis.enable_notebook()
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

Creating Input

In [7]:
# Delete every `.ipynb_checkpoints` directory found below the working directory.
# NOTE(review): presumably done so that checkpoint copies are not picked up as
# corpus files when the TRANSFORMED/* folders are read further down - confirm.
!rm -rf `find -type d -name .ipynb_checkpoints`
In [8]:
# Helper functions for stop-word removal and lemmatization
def remove_stopwords(texts):
    """Tokenize each document and drop every token found in ``stop_words``.

    Each element of ``texts`` is converted with ``str()`` and tokenized by
    gensim's ``simple_preprocess``; tokens present in the module-level
    ``stop_words`` list are removed.

    Args:
        texts: iterable of documents (strings or token lists).

    Returns:
        A list with one list of remaining tokens per input document,
        in input order.
    """
    # Build the lookup set once per call: membership tests against the raw
    # list are linear in its length and were repeated for every single token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only tokens with allowed POS tags.

    Every document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``; the lemma of each resulting token is
    kept when its coarse part-of-speech tag (``token.pos_``) is allowed.

    Args:
        texts: iterable of documents, each an iterable of string tokens.
        allowed_postags: POS tags to keep. The default list is only read,
            never modified.

    Returns:
        A list with one list of lemma strings per input document, in input
        order.
    """
    # nlp.pipe processes the documents as a batched stream instead of paying
    # the per-call overhead of nlp(...) once for every document.
    joined_docs = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]
In [9]:
# Sentence to words
def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Each item is converted with ``str()`` and tokenized by
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (accent marks are
    stripped from the tokens).
    """
    yield from (
        gensim.utils.simple_preprocess(str(item), deacc=True)
        for item in sentences
    )
In [11]:
# One corpus directory per decade, in chronological order:
# 'TRANSFORMED/1950-1959' up to and including 'TRANSFORMED/2020-2029'.
corpus_topics = [
    'TRANSFORMED/{}-{}'.format(decade_start, decade_start + 9)
    for decade_start in range(1950, 2030, 10)
]
In [13]:
# Decades whose file names carry the document id as the first "_"-separated
# field; for every other decade the id is the last field with its final four
# characters (the file extension) stripped.
ID_FIRST_ROOTS = {'TRANSFORMED/1950-1959', 'TRANSFORMED/1960-1969', 'TRANSFORMED/1970-1979'}

# Raw strings keep the regex escapes (\S, \s) from being interpreted as
# (invalid) Python string escapes, which newer interpreters warn about.
EMAIL_RE = re.compile(r'\S*@\S*\s?')
WHITESPACE_RE = re.compile(r'\s+')

# Lemmatized documents of all decades, appended in corpus_topics order.
texts = []
# Number of documents contributed by each decade, in the same order. These are
# the per-slice counts that a time-sliced model over `corpus` has to be given.
docs_per_slice = []

for corpus_root in corpus_topics:
    # Named `reader` so it is not confused with the bag-of-words `corpus`
    # that is built after the loop.
    reader = PlaintextCorpusReader(corpus_root, '.*txt')
    rows = []
    for text in reader.fileids():
        if corpus_root in ID_FIRST_ROOTS:
            identifier = text.split("_")[0]
        else:
            identifier = text.split("_")[-1][:-4]
        rows.append([identifier, " ".join(reader.words(text))])
    # CREATE DATAFRAME
    df = pd.DataFrame.from_records(rows, columns=['id', 'content'])

    # CLEANING
    # Convert to list
    data = df['content'].values.tolist()
    # Remove emails
    data = [EMAIL_RE.sub('', sent) for sent in data]
    # Collapse runs of whitespace (including new lines) into a single space
    data = [WHITESPACE_RE.sub(' ', sent) for sent in data]
    # Remove distracting single quotes
    data = [sent.replace("'", "") for sent in data]
    # Sentence to words
    data_words = list(sent_to_words(data))

    # BUILD THE BIGRAM MODEL
    # NOTE(review): neither `bigram` nor `bigram_mod` is applied to the tokens
    # anywhere in this cell, and both are overwritten on every iteration.
    # Kept in case a later cell relies on `bigram_mod`; otherwise removable.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    # Remove stop words
    data_words_nostops = remove_stopwords(data_words)
    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # INPUT FOR CREATING THE DICTIONARY AND CORPUS NEEDED FOR TOPIC MODELING
    texts.extend(data_lemmatized)
    docs_per_slice.append(len(data_lemmatized))

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
print("*"*20+"FINISHED"+"*"*20)
********************FINISHED********************
In [14]:
# with open('news_texts.pkl', 'wb') as f:
#     pickle.dump(texts, f)
# with open('news_corpus.pkl', 'wb') as f:
#     pickle.dump(corpus, f)
# with open('news_dictionary.pkl', 'wb') as f:
#     pickle.dump(dictionary, f)

Model Creation

In [4]:
print("Started loading..")
# Restore the bag-of-words corpus from its pickle file.
with open('news_corpus.pkl', mode='rb') as corpus_fh:
    corpus = pickle.load(corpus_fh)
# Restore the matching gensim dictionary from its pickle file.
with open('news_dictionary.pkl', mode='rb') as dictionary_fh:
    dictionary = pickle.load(dictionary_fh)
print("Finished loading..")
Started loading..
Finished loading..
In [ ]:
print("Started training..")
# Number of documents in each consecutive time slice of `corpus`
# (eight entries - presumably one per decade directory; confirm against the
# per-decade document counts of the corpus-building cell).
time_slice = [1051, 2900, 1258, 1771, 1520, 2010, 2852, 412]
# The slices partition the corpus by position, so the counts must add up to
# the corpus size; fail loudly here instead of training on misaligned slices.
if sum(time_slice) != len(corpus):
    raise ValueError(
        "time_slice covers %d documents but corpus has %d" % (sum(time_slice), len(corpus))
    )
lda_seq = LdaSeqModel(corpus=corpus, time_slice=time_slice, id2word=dictionary, num_topics=20, chunksize=200, passes=1)
print("Finished training..")
Started training..
/opt/conda/lib/python3.7/site-packages/gensim/models/ldaseqmodel.py:297: RuntimeWarning: divide by zero encountered in double_scalars
  convergence = np.fabs((bound - old_bound) / old_bound)
In [ ]:
# print("Saving model..")
# with open('lda_seq_model.pkl', 'wb') as f:
#     pickle.dump(lda_seq, f)
# print("Saved model!")

Explore Model

In [8]:
# Announce the load before the first file is read (the message used to be
# printed only after the model had already been loaded).
print("Started loading..")
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that were produced by this notebook.
with open('lda_seq_model.pkl', 'rb') as f:
    ldaseq = pickle.load(f)
with open('news_corpus.pkl', 'rb') as f:
    corpus = pickle.load(f)
with open('news_dictionary.pkl', 'rb') as f:
    dictionary = pickle.load(f)
with open('news_texts.pkl', 'rb') as f:
    texts = pickle.load(f)
print("Finished loading..")
Started loading..
Finished loading..

Printing Topics

To print all topics for a particular time period, use `print_topics`. Its `time` parameter selects the time slice; passing `0` shows the topics in the first time slice. The result is a list of lists: one inner list per topic, each containing `(word, word_probability)` tuples for that topic's most probable words.

In [9]:
# Show the topics of the first time slice (time=0): 20 topics, each listed as
# its top (word, probability) pairs.
ldaseq.print_topics(time=0)
Out[9]:
[[('exchange', 0.032585327213301735),
  ('stock', 0.019525168953572793),
  ('permission', 0.013779895045716464),
  ('security', 0.012904726996567844),
  ('market', 0.012717460121291923),
  ('system', 0.011966344322286184),
  ('trading', 0.011877605199272076),
  ('volume', 0.011071054074381099),
  ('board', 0.00983614316524415),
  ('firm', 0.009088387644313996),
  ('order', 0.008521025431274625),
  ('floor', 0.008180032181665842),
  ('member', 0.008046198309664397),
  ('trade', 0.007470811077206533),
  ('broker', 0.00668577235927363),
  ('brokerage', 0.006471767578013388),
  ('street', 0.0062318701803079765),
  ('automation', 0.005983172342001948),
  ('continue', 0.005849581878421414),
  ('copyright', 0.005694449381308857)],
 [('newspaper', 0.09606789463087527),
  ('advertising', 0.05312090521009613),
  ('publisher', 0.039837194670586655),
  ('paper', 0.03074020067403551),
  ('printer', 0.026955371687537154),
  ('power', 0.020145897414895017),
  ('press', 0.014176352632787617),
  ('daily', 0.014045699631740054),
  ('publish', 0.012782711992622972),
  ('print', 0.012658235330653592),
  ('magazine', 0.008146467428744168),
  ('printing', 0.008099167304469568),
  ('publication', 0.008033553714641949),
  ('mailer', 0.007813228488243668),
  ('automate', 0.005986524545437834),
  ('editor', 0.005651443875510482),
  ('report', 0.005569449924498103),
  ('article', 0.00541480738625018),
  ('page', 0.005386137704830074),
  ('circulation', 0.005081797362879027)],
 [('ship', 0.012365599238923846),
  ('system', 0.010590864330765608),
  ('railroad', 0.009473299806559221),
  ('cargo', 0.008836626836704713),
  ('train', 0.007368447712807571),
  ('operation', 0.006603574130182176),
  ('line', 0.006336623626940632),
  ('vessel', 0.006299957827477621),
  ('automate', 0.005470627901870686),
  ('transportation', 0.005453855810950323),
  ('authority', 0.005426993698741521),
  ('container', 0.005395153173148028),
  ('passenger', 0.005319947000425335),
  ('service', 0.005062271677269598),
  ('merchant', 0.004647215335820726),
  ('engineer', 0.004624340011107159),
  ('company', 0.004609875008777542),
  ('official', 0.004404284944756175),
  ('permission', 0.004378372840015303),
  ('operate', 0.0042769707507606)],
 [('people', 0.009950722007528839),
  ('year', 0.006650013416453143),
  ('s', 0.006108719789073657),
  ('thing', 0.005968281899480531),
  ('think', 0.005616948478471469),
  ('permission', 0.005376642819900104),
  ('still', 0.00474891036392372),
  ('world', 0.004271340573007294),
  ('first', 0.0041119764444017964),
  ('go', 0.0037885185854693174),
  ('become', 0.003768013546983366),
  ('never', 0.003704128876450715),
  ('little', 0.0036056395879254665),
  ('work', 0.003334531837095637),
  ('young', 0.003332998094673147),
  ('child', 0.0033127942386714663),
  ('woman', 0.003253973725257995),
  ('great', 0.003183971263306896),
  ('place', 0.0030825916468417334),
  ('ve', 0.0030608112474049733)],
 [('increase', 0.01858994949188471),
  ('industry', 0.01653220306374227),
  ('economic', 0.014378103987671913),
  ('economy', 0.011555811046831076),
  ('labor', 0.010889516451965101),
  ('unemployment', 0.01031511504696658),
  ('production', 0.009218954732922948),
  ('worker', 0.009143191065400319),
  ('government', 0.008494211924566227),
  ('year', 0.007903373487536038),
  ('growth', 0.007388459350588343),
  ('country', 0.006986329641486134),
  ('employment', 0.006871293588269364),
  ('price', 0.006712554857519727),
  ('nation', 0.006163556878036737),
  ('industrial', 0.005744745110641743),
  ('great', 0.005721898678360406),
  ('demand', 0.00555109651332969),
  ('productivity', 0.005279977465559953),
  ('level', 0.005272757222175679)],
 [('company', 0.03438333635295421),
  ('stock', 0.015303246254250039),
  ('share', 0.014543791853908906),
  ('permission', 0.013035912541362507),
  ('yesterday', 0.012308365686785151),
  ('business', 0.010173587072600397),
  ('market', 0.009687938162853666),
  ('price', 0.009148588960964293),
  ('executive', 0.008011533656602251),
  ('investment', 0.006534007699127622),
  ('continue', 0.006426770642137569),
  ('general', 0.00636164965866428),
  ('corporation', 0.006053411370052804),
  ('industry', 0.005801940743886901),
  ('announce', 0.005582035631122625),
  ('fund', 0.005550644215475377),
  ('prohibit', 0.005491192932301378),
  ('automation', 0.005318170228084742),
  ('copyright', 0.005289182041969421),
  ('reproduction', 0.005265225196952057)],
 [('state', 0.008495056263491422),
  ('government', 0.008326685084300268),
  ('people', 0.00748504033273375),
  ('country', 0.007402187790138975),
  ('world', 0.006470591758178122),
  ('political', 0.0060264086088178265),
  ('party', 0.005632274707772578),
  ('public', 0.0051404316128397754),
  ('policy', 0.005122108764211892),
  ('right', 0.005094860739492976),
  ('national', 0.0049572998602585685),
  ('great', 0.0047422585999671064),
  ('president', 0.0045627781425011515),
  ('problem', 0.004516084374651158),
  ('economic', 0.004262290062613127),
  ('nation', 0.004179330791901012),
  ('power', 0.0040385740944748495),
  ('american', 0.003810674817858352),
  ('administration', 0.0037630210675274663),
  ('democratic', 0.0037187993632546786)],
 [('service', 0.021923093203187975),
  ('office', 0.018618269077862664),
  ('check', 0.014954555749711908),
  ('bank', 0.013970333444227828),
  ('machine', 0.013200022817270373),
  ('system', 0.010601057755746714),
  ('number', 0.009293236716048515),
  ('postal', 0.0075842996852989175),
  ('customer', 0.007257795131516029),
  ('account', 0.0058417600082403135),
  ('state', 0.005834821269495697),
  ('money', 0.005785045031141776),
  ('telephone', 0.0056169783070699674),
  ('call', 0.005603363343660584),
  ('automate', 0.005590755315003152),
  ('letter', 0.005264722303862599),
  ('information', 0.0049830429809890045),
  ('company', 0.004794981657722855),
  ('record', 0.0047393715648517045),
  ('ticket', 0.004710083498846137)],
 [('machine', 0.018657095965119438),
  ('soviet', 0.014511914802392775),
  ('control', 0.011877288220216898),
  ('research', 0.009445856664732912),
  ('automation', 0.008633837480360515),
  ('technology', 0.008588858168235163),
  ('engineer', 0.007488782336571709),
  ('human', 0.007212730884565282),
  ('process', 0.00705763350229349),
  ('industrial', 0.006975652850053538),
  ('development', 0.006945268951762575),
  ('computer', 0.006567816397629821),
  ('system', 0.006037750309041726),
  ('science', 0.00600083794598743),
  ('scientist', 0.005724263801525289),
  ('robot', 0.00546616396513126),
  ('field', 0.005388398770925304),
  ('develop', 0.005237755471674822),
  ('scientific', 0.005091197028127519),
  ('method', 0.004689699009544589)],
 [('permission', 0.02061697161017611),
  ('program', 0.018093374685169018),
  ('school', 0.015558306802175468),
  ('reproduction', 0.009999228553334318),
  ('education', 0.009693887456845218),
  ('prohibit', 0.009557678153310493),
  ('reproduce', 0.009507364759887617),
  ('copyright', 0.009145674244678688),
  ('student', 0.00906810570688097),
  ('owner', 0.008894756109613447),
  ('automation', 0.007748977844909375),
  ('training', 0.007309286049411055),
  ('study', 0.006649066528759616),
  ('state', 0.0064857071728344875),
  ('year', 0.006032963195752962),
  ('college', 0.005533228365799739),
  ('group', 0.0053264042442501824),
  ('public', 0.00509738835339857),
  ('negro', 0.004818684881140006),
  ('business', 0.004799958443912522)],
 [('player', 0.02499162601123833),
  ('video', 0.017216940286265893),
  ('track', 0.015818822973546504),
  ('medium', 0.014518547877847633),
  ('bet', 0.013459488346223068),
  ('company', 0.0133392100378268),
  ('league', 0.011676685810224633),
  ('social', 0.011672821277366761),
  ('account', 0.011152639975051204),
  ('broadcast', 0.009749696914135844),
  ('network', 0.009354884777311367),
  ('platform', 0.009048167789714874),
  ('ind', 0.009023893896226537),
  ('user', 0.008654888305796459),
  ('bot', 0.007898592224306531),
  ('facebook', 0.007588068271221454),
  ('post', 0.007416228621226182),
  ('youtube', 0.0069558935820492485),
  ('team', 0.006799660203994795),
  ('sport', 0.0065445205791145235)],
 [('computer', 0.08653165756240233),
  ('system', 0.035316351443099026),
  ('electronic', 0.029973153341881782),
  ('company', 0.018286446591108518),
  ('equipment', 0.01590387109883422),
  ('machine', 0.015840550934628415),
  ('television', 0.012866723616495235),
  ('radio', 0.012188532257576302),
  ('information', 0.011253898648488574),
  ('technology', 0.009885957858338337),
  ('software', 0.0085930040094533),
  ('product', 0.00802089924332822),
  ('network', 0.007651119074087238),
  ('office', 0.0070699709554124315),
  ('program', 0.006672524848576315),
  ('business', 0.006433501014590473),
  ('station', 0.00572223179923852),
  ('telephone', 0.005614906137728465),
  ('device', 0.005573701427481308),
  ('processing', 0.005524225327880115)],
 [('company', 0.027483739641466342),
  ('plant', 0.02633278405034154),
  ('industry', 0.01927664884881896),
  ('production', 0.01742779050041442),
  ('product', 0.015545150730505367),
  ('sale', 0.012194607993943408),
  ('machine', 0.010136267699262432),
  ('store', 0.009831056015060659),
  ('manufacturer', 0.008773338512114393),
  ('factory', 0.00808688134802922),
  ('steel', 0.008026619077273176),
  ('business', 0.00693647703574592),
  ('year', 0.006908260426940148),
  ('price', 0.006251938726100135),
  ('produce', 0.006237055832535299),
  ('large', 0.00575656672227135),
  ('market', 0.005537970810680546),
  ('equipment', 0.005514171476236522),
  ('small', 0.005456502386943131),
  ('operation', 0.005226949622782238)],
 [('plane', 0.01850746110353654),
  ('airline', 0.017009204768873673),
  ('flight', 0.015394466703076476),
  ('pilot', 0.01452770313235427),
  ('aircraft', 0.013971025110287567),
  ('agency', 0.013318731214447559),
  ('traffic', 0.012613302223636243),
  ('system', 0.012146555290589571),
  ('control', 0.00945270861789617),
  ('travel', 0.00909392346181037),
  ('aviation', 0.007842061523965492),
  ('airport', 0.00771365565015372),
  ('controller', 0.007023513347718533),
  ('safety', 0.006775676200996918),
  ('problem', 0.006603064136625036),
  ('computer', 0.005414642270772881),
  ('fly', 0.005348659114066793),
  ('year', 0.005278792187482113),
  ('military', 0.005118468885188332),
  ('passenger', 0.004794905334473019)],
 [('french', 0.023340056271279812),
  ('amer', 0.014724922141500337),
  ('system', 0.012867601912611975),
  ('short', 0.012029456243161842),
  ('tesla', 0.009545266718281077),
  ('first', 0.009354561449154036),
  ('inct', 0.007211273405721458),
  ('stock', 0.007102904647650057),
  ('security', 0.005583341701483589),
  ('model', 0.005429372786616488),
  ('market', 0.005155882813584954),
  ('western', 0.0049245262518794195),
  ('medical', 0.004837195339966721),
  ('position', 0.004473713825861505),
  ('musk', 0.003981102180402508),
  ('food', 0.003788085680733229),
  ('price', 0.003708101771775635),
  ('champion', 0.003699878990902474),
  ('national', 0.0036818874969097456),
  ('health', 0.0035901168138519107)],
 [('permission', 0.04954533471297379),
  ('prohibit', 0.02724513670330728),
  ('owner', 0.02704768073100361),
  ('reproduction', 0.02696951192992844),
  ('copyright', 0.026532242846099428),
  ('reproduce', 0.026195895030701848),
  ('automation', 0.012649187402691935),
  ('building', 0.011651281827532492),
  ('space', 0.011449219850737604),
  ('equipment', 0.011395097550768594),
  ('device', 0.008557611471579712),
  ('continue', 0.007118302693738038),
  ('patent', 0.006981176810429632),
  ('design', 0.006036008275362876),
  ('control', 0.00599441069990523),
  ('build', 0.0057554359011571365),
  ('water', 0.005456202988759333),
  ('office', 0.005212702309664257),
  ('system', 0.005091805308881497),
  ('electronic', 0.005081247094471481)],
 [('union', 0.02724814809589055),
  ('permission', 0.019619495485561498),
  ('labor', 0.016561590450852304),
  ('automation', 0.01290567780077134),
  ('strike', 0.009901004213745086),
  ('worker', 0.009698922397178353),
  ('owner', 0.009480839880955747),
  ('prohibit', 0.009475049971195922),
  ('reproduction', 0.009174798233724598),
  ('reproduce', 0.009129351796430261),
  ('contract', 0.008600653500377916),
  ('copyright', 0.008297253223272133),
  ('industry', 0.007909133298162698),
  ('member', 0.006999783659610874),
  ('agreement', 0.006053547802245419),
  ('increase', 0.0052894276006262095),
  ('management', 0.005176026380584805),
  ('today', 0.0050627846679832215),
  ('negotiation', 0.004649301316310655),
  ('yesterday', 0.004574435827297263)],
 [('camera', 0.02369501135622689),
  ('light', 0.01262421572421739),
  ('picture', 0.008075757056533373),
  ('automatic', 0.008060379115915254),
  ('color', 0.0070889356204260475),
  ('speed', 0.006445916936437219),
  ('exposure', 0.006254751767765626),
  ('control', 0.005361130151208236),
  ('model', 0.005119036777105239),
  ('available', 0.004135797198844008),
  ('photographer', 0.004010304317582235),
  ('flash', 0.003868306994198768),
  ('second', 0.003825899354507225),
  ('feature', 0.0037092899593370128),
  ('design', 0.00358215448018945),
  ('small', 0.0035195786753407263),
  ('take', 0.003519368049938951),
  ('offer', 0.0034738599237142866),
  ('photography', 0.003460619078209394),
  ('automation', 0.003458869868915345)],
 [('music', 0.009895354436537605),
  ('direct', 0.007887800178259338),
  ('theater', 0.007296888287877855),
  ('exhibition', 0.007167145536072739),
  ('today', 0.006497541941914068),
  ('artist', 0.006290297494902606),
  ('close', 0.006175467153483543),
  ('open', 0.0058133405004078195),
  ('painting', 0.005362120068005614),
  ('musical', 0.004952512123165143),
  ('mon', 0.0046437407626743294),
  ('movie', 0.0045563787815507545),
  ('dance', 0.004200322629975145),
  ('play', 0.004131673539855562),
  ('work', 0.004020585993191775),
  ('first', 0.0038573301590078797),
  ('th', 0.0037989252554847543),
  ('gallery', 0.0035169410906955093),
  ('closed', 0.0034522251443237875),
  ('other', 0.003450772517015871)],
 [('income', 0.07410517480389289),
  ('sale', 0.0609328315072059),
  ('earn', 0.04768356804128749),
  ('share', 0.046404012138632586),
  ('cent', 0.024499765128165325),
  ('earning', 0.0223157669716508),
  ('shr', 0.02148704068282033),
  ('quarter', 0.018658315256049617),
  ('company', 0.01715355160657727),
  ('revenue', 0.016182863117282568),
  ('report', 0.015185602661006459),
  ('month', 0.01060821103758829),
  ('qtr', 0.009830465857372832),
  ('period', 0.009652414286836988),
  ('credit', 0.007589122159118673),
  ('compare', 0.007189551738609515),
  ('profit', 0.00698371241299447),
  ('industry', 0.006725513791469736),
  ('first', 0.006681227890385564),
  ('operation', 0.006329828079396752)]]

Looking for Topic Evolution

To fix a topic and see how it evolves over time, use `print_topic_times`. The input parameter is the topic id. In this case we look at topic 0, whose top words in the first time slice relate to stock exchanges and trading (`exchange`, `stock`, `market`).

In [10]:
# One list of (word, probability) pairs per time slice for the chosen topic.
ldaseq.print_topic_times(topic=0) # evolution of 1st topic
Out[10]:
[[('exchange', 0.032585327213301735),
  ('stock', 0.019525168953572793),
  ('permission', 0.013779895045716464),
  ('security', 0.012904726996567844),
  ('market', 0.012717460121291923),
  ('system', 0.011966344322286184),
  ('trading', 0.011877605199272076),
  ('volume', 0.011071054074381099),
  ('board', 0.00983614316524415),
  ('firm', 0.009088387644313996),
  ('order', 0.008521025431274625),
  ('floor', 0.008180032181665842),
  ('member', 0.008046198309664397),
  ('trade', 0.007470811077206533),
  ('broker', 0.00668577235927363),
  ('brokerage', 0.006471767578013388),
  ('street', 0.0062318701803079765),
  ('automation', 0.005983172342001948),
  ('continue', 0.005849581878421414),
  ('copyright', 0.005694449381308857)],
 [('exchange', 0.0331196315128335),
  ('stock', 0.02092678261073136),
  ('market', 0.014609282235584152),
  ('security', 0.013194687781034328),
  ('trading', 0.012613570037657717),
  ('system', 0.012467251676093596),
  ('permission', 0.011636224244246431),
  ('volume', 0.009904017298788858),
  ('board', 0.009275352210749006),
  ('firm', 0.009212339138015495),
  ('member', 0.008245979822233058),
  ('trade', 0.007969984762220867),
  ('order', 0.0078123035660590924),
  ('floor', 0.007801816998909722),
  ('brokerage', 0.006959332979403503),
  ('broker', 0.006484932170801021),
  ('street', 0.0060828198222007464),
  ('automation', 0.005635194277664037),
  ('continue', 0.005411381729338492),
  ('prohibit', 0.0052571426489738445)],
 [('exchange', 0.0327668096644913),
  ('market', 0.020224352321911464),
  ('stock', 0.019540435405281043),
  ('trading', 0.014364352568187817),
  ('system', 0.013622012408935739),
  ('security', 0.012540665220901862),
  ('trade', 0.009437102510416115),
  ('firm', 0.009020504698684973),
  ('board', 0.008242158049337577),
  ('order', 0.008150256103346428),
  ('permission', 0.0077112996749747606),
  ('volume', 0.007694788555035292),
  ('floor', 0.006993435292938503),
  ('member', 0.006391637956339135),
  ('brokerage', 0.0060458566596990875),
  ('hospital', 0.005838417714809661),
  ('price', 0.005710949840151155),
  ('broker', 0.005671292860470882),
  ('street', 0.005462868974053713),
  ('share', 0.004540939137808188)],
 [('exchange', 0.03007145488225922),
  ('market', 0.021436591271085614),
  ('stock', 0.018354846600316676),
  ('trading', 0.016855026641485298),
  ('system', 0.014391177373105251),
  ('trade', 0.011051770646114806),
  ('security', 0.008855833034999353),
  ('firm', 0.008068452457711726),
  ('order', 0.008025635340232587),
  ('hospital', 0.007348555372113743),
  ('board', 0.006834965498695129),
  ('price', 0.00635553858614998),
  ('floor', 0.00608295393243232),
  ('volume', 0.0059213822993153895),
  ('health', 0.005199313418848411),
  ('brokerage', 0.005170662044432769),
  ('trader', 0.004901271430651695),
  ('broker', 0.004893733375873561),
  ('specialist', 0.004647581381634004),
  ('member', 0.004597337271095108)],
 [('exchange', 0.02427050058790468),
  ('market', 0.018778399739240916),
  ('trading', 0.018526578905020602),
  ('stock', 0.01659102990702977),
  ('system', 0.013858301878262324),
  ('trade', 0.011677902134152082),
  ('hospital', 0.00824085444990267),
  ('health', 0.008215994757157665),
  ('firm', 0.007800242967767463),
  ('order', 0.007107223422484484),
  ('price', 0.0066789527716124295),
  ('patient', 0.006070757781817086),
  ('trader', 0.005772397070576905),
  ('security', 0.00561232389339052),
  ('board', 0.005390571710122757),
  ('floor', 0.005213519619956403),
  ('doctor', 0.005096074446515188),
  ('medical', 0.005058182155013028),
  ('electronic', 0.0050554679156972655),
  ('broker', 0.004518951157446992)],
 [('trading', 0.018720348873374593),
  ('exchange', 0.017585690322573906),
  ('market', 0.015462853544863639),
  ('stock', 0.013002941949816212),
  ('health', 0.012301500495594482),
  ('system', 0.012167959266720492),
  ('trade', 0.01113204472325179),
  ('firm', 0.009046096955799486),
  ('hospital', 0.007830671872992948),
  ('patient', 0.007828839675304086),
  ('price', 0.006732421489141514),
  ('doctor', 0.0065496056660843795),
  ('medical', 0.006275422110576385),
  ('order', 0.006144387672810755),
  ('trader', 0.005829600247052026),
  ('electronic', 0.005338107336447084),
  ('floor', 0.004406537179431894),
  ('board', 0.004257949713702873),
  ('public', 0.004239189229906642),
  ('specialist', 0.0041487077977271)],
 [('trading', 0.017498117023685308),
  ('health', 0.016084907887894507),
  ('market', 0.015379612821735703),
  ('stock', 0.012814683080312361),
  ('exchange', 0.012231688985304214),
  ('firm', 0.011407955117219468),
  ('system', 0.010554599852718808),
  ('trade', 0.00979409042873601),
  ('patient', 0.00902580332666547),
  ('hospital', 0.008834415143288988),
  ('doctor', 0.007666074416386521),
  ('medical', 0.007379682043586188),
  ('price', 0.006401460758608612),
  ('trader', 0.00609552252459637),
  ('order', 0.0058231605877439715),
  ('public', 0.004774263351396775),
  ('electronic', 0.004722251811883048),
  ('investor', 0.0040825622191675084),
  ('automate', 0.004053015327299393),
  ('test', 0.003908703140890568)],
 [('health', 0.029383616053607144),
  ('trading', 0.01571910395065879),
  ('market', 0.011219201971880078),
  ('hospital', 0.010971427848568336),
  ('system', 0.00974351235330832),
  ('stock', 0.009707326473732136),
  ('exchange', 0.009524757767081194),
  ('firm', 0.00912003163705149),
  ('patient', 0.008715766063859412),
  ('trade', 0.008564227456765699),
  ('doctor', 0.00818769550723332),
  ('medical', 0.008070800323655163),
  ('virus', 0.0063314116105405325),
  ('test', 0.006081996351203265),
  ('public', 0.006054273244108267),
  ('price', 0.0058544262386458126),
  ('order', 0.005486132014606381),
  ('trader', 0.005271502196686805),
  ('report', 0.004642354392985513),
  ('coronavirus', 0.004260691352619709)]]
In [15]:
# NOTE(review): same call as the earlier In [10] cell, producing the same
# listing; this cell looks redundant.
ldaseq.print_topic_times(topic=0) # evolution of 1st topic
Out[15]:
[[('exchange', 0.032585327213301735),
  ('stock', 0.019525168953572793),
  ('permission', 0.013779895045716464),
  ('security', 0.012904726996567844),
  ('market', 0.012717460121291923),
  ('system', 0.011966344322286184),
  ('trading', 0.011877605199272076),
  ('volume', 0.011071054074381099),
  ('board', 0.00983614316524415),
  ('firm', 0.009088387644313996),
  ('order', 0.008521025431274625),
  ('floor', 0.008180032181665842),
  ('member', 0.008046198309664397),
  ('trade', 0.007470811077206533),
  ('broker', 0.00668577235927363),
  ('brokerage', 0.006471767578013388),
  ('street', 0.0062318701803079765),
  ('automation', 0.005983172342001948),
  ('continue', 0.005849581878421414),
  ('copyright', 0.005694449381308857)],
 [('exchange', 0.0331196315128335),
  ('stock', 0.02092678261073136),
  ('market', 0.014609282235584152),
  ('security', 0.013194687781034328),
  ('trading', 0.012613570037657717),
  ('system', 0.012467251676093596),
  ('permission', 0.011636224244246431),
  ('volume', 0.009904017298788858),
  ('board', 0.009275352210749006),
  ('firm', 0.009212339138015495),
  ('member', 0.008245979822233058),
  ('trade', 0.007969984762220867),
  ('order', 0.0078123035660590924),
  ('floor', 0.007801816998909722),
  ('brokerage', 0.006959332979403503),
  ('broker', 0.006484932170801021),
  ('street', 0.0060828198222007464),
  ('automation', 0.005635194277664037),
  ('continue', 0.005411381729338492),
  ('prohibit', 0.0052571426489738445)],
 [('exchange', 0.0327668096644913),
  ('market', 0.020224352321911464),
  ('stock', 0.019540435405281043),
  ('trading', 0.014364352568187817),
  ('system', 0.013622012408935739),
  ('security', 0.012540665220901862),
  ('trade', 0.009437102510416115),
  ('firm', 0.009020504698684973),
  ('board', 0.008242158049337577),
  ('order', 0.008150256103346428),
  ('permission', 0.0077112996749747606),
  ('volume', 0.007694788555035292),
  ('floor', 0.006993435292938503),
  ('member', 0.006391637956339135),
  ('brokerage', 0.0060458566596990875),
  ('hospital', 0.005838417714809661),
  ('price', 0.005710949840151155),
  ('broker', 0.005671292860470882),
  ('street', 0.005462868974053713),
  ('share', 0.004540939137808188)],
 [('exchange', 0.03007145488225922),
  ('market', 0.021436591271085614),
  ('stock', 0.018354846600316676),
  ('trading', 0.016855026641485298),
  ('system', 0.014391177373105251),
  ('trade', 0.011051770646114806),
  ('security', 0.008855833034999353),
  ('firm', 0.008068452457711726),
  ('order', 0.008025635340232587),
  ('hospital', 0.007348555372113743),
  ('board', 0.006834965498695129),
  ('price', 0.00635553858614998),
  ('floor', 0.00608295393243232),
  ('volume', 0.0059213822993153895),
  ('health', 0.005199313418848411),
  ('brokerage', 0.005170662044432769),
  ('trader', 0.004901271430651695),
  ('broker', 0.004893733375873561),
  ('specialist', 0.004647581381634004),
  ('member', 0.004597337271095108)],
 [('exchange', 0.02427050058790468),
  ('market', 0.018778399739240916),
  ('trading', 0.018526578905020602),
  ('stock', 0.01659102990702977),
  ('system', 0.013858301878262324),
  ('trade', 0.011677902134152082),
  ('hospital', 0.00824085444990267),
  ('health', 0.008215994757157665),
  ('firm', 0.007800242967767463),
  ('order', 0.007107223422484484),
  ('price', 0.0066789527716124295),
  ('patient', 0.006070757781817086),
  ('trader', 0.005772397070576905),
  ('security', 0.00561232389339052),
  ('board', 0.005390571710122757),
  ('floor', 0.005213519619956403),
  ('doctor', 0.005096074446515188),
  ('medical', 0.005058182155013028),
  ('electronic', 0.0050554679156972655),
  ('broker', 0.004518951157446992)],
 [('trading', 0.018720348873374593),
  ('exchange', 0.017585690322573906),
  ('market', 0.015462853544863639),
  ('stock', 0.013002941949816212),
  ('health', 0.012301500495594482),
  ('system', 0.012167959266720492),
  ('trade', 0.01113204472325179),
  ('firm', 0.009046096955799486),
  ('hospital', 0.007830671872992948),
  ('patient', 0.007828839675304086),
  ('price', 0.006732421489141514),
  ('doctor', 0.0065496056660843795),
  ('medical', 0.006275422110576385),
  ('order', 0.006144387672810755),
  ('trader', 0.005829600247052026),
  ('electronic', 0.005338107336447084),
  ('floor', 0.004406537179431894),
  ('board', 0.004257949713702873),
  ('public', 0.004239189229906642),
  ('specialist', 0.0041487077977271)],
 [('trading', 0.017498117023685308),
  ('health', 0.016084907887894507),
  ('market', 0.015379612821735703),
  ('stock', 0.012814683080312361),
  ('exchange', 0.012231688985304214),
  ('firm', 0.011407955117219468),
  ('system', 0.010554599852718808),
  ('trade', 0.00979409042873601),
  ('patient', 0.00902580332666547),
  ('hospital', 0.008834415143288988),
  ('doctor', 0.007666074416386521),
  ('medical', 0.007379682043586188),
  ('price', 0.006401460758608612),
  ('trader', 0.00609552252459637),
  ('order', 0.0058231605877439715),
  ('public', 0.004774263351396775),
  ('electronic', 0.004722251811883048),
  ('investor', 0.0040825622191675084),
  ('automate', 0.004053015327299393),
  ('test', 0.003908703140890568)],
 [('health', 0.029383616053607144),
  ('trading', 0.01571910395065879),
  ('market', 0.011219201971880078),
  ('hospital', 0.010971427848568336),
  ('system', 0.00974351235330832),
  ('stock', 0.009707326473732136),
  ('exchange', 0.009524757767081194),
  ('firm', 0.00912003163705149),
  ('patient', 0.008715766063859412),
  ('trade', 0.008564227456765699),
  ('doctor', 0.00818769550723332),
  ('medical', 0.008070800323655163),
  ('virus', 0.0063314116105405325),
  ('test', 0.006081996351203265),
  ('public', 0.006054273244108267),
  ('price', 0.0058544262386458126),
  ('order', 0.005486132014606381),
  ('trader', 0.005271502196686805),
  ('report', 0.004642354392985513),
  ('coronavirus', 0.004260691352619709)]]

Visualising Dynamic Topic Models

In [28]:
def prepare_dtm_vis(model, bow_corpus, slice_idx):
    """Build the pyLDAvis data for one time slice of a dynamic topic model.

    Parameters
    ----------
    model
        Fitted model exposing ``dtm_vis(time=..., corpus=...)`` (here: ``ldaseq``).
    bow_corpus
        Corpus forwarded unchanged to ``model.dtm_vis``.
    slice_idx : int
        Index of the time slice to visualise.

    Returns
    -------
    The object returned by ``pyLDAvis.prepare`` for that slice.
    """
    doc_topic, topic_term, doc_lengths, term_frequency, vocab = model.dtm_vis(
        time=slice_idx, corpus=bow_corpus
    )
    return pyLDAvis.prepare(
        topic_term_dists=topic_term,
        doc_topic_dists=doc_topic,
        doc_lengths=doc_lengths,
        vocab=vocab,
        term_frequency=term_frequency,
    )


# One visualisation per time slice. The number of slices is read from the
# fitted model rather than being fixed by the number of copy-pasted cells.
for slice_idx in range(len(ldaseq.time_slice)):
    print("Time slice {}".format(slice_idx))
    vis_wrapper = prepare_dtm_vis(ldaseq, corpus, slice_idx)
    # Inside a loop only an explicit display() renders the widget; `display`
    # is the built-in provided by the IPython/Jupyter kernel.
    display(pyLDAvis.display(vis_wrapper))
Out[39]:

Topic Coherence

In [18]:
# Report two coherence scores for every time slice of the dynamic topic model.
# The slice count is taken from the fitted model instead of a hard-coded 8, and
# the loop variable no longer reuses the name of the stdlib `time` module.
for slice_idx in range(len(ldaseq.time_slice)):
    # Topics of this slice in the form accepted by CoherenceModel(topics=...).
    topics_dtm = ldaseq.dtm_coherence(time=slice_idx)

    # u_mass is requested with the corpus as its reference data.
    cm_umass = CoherenceModel(topics=topics_dtm, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    print("U_mass topic coherence for time slice {} is {}".format(slice_idx, cm_umass.get_coherence()))

    # c_v is requested with the tokenised texts as its reference data.
    cm_cv = CoherenceModel(topics=topics_dtm, texts=texts, dictionary=dictionary, coherence='c_v')
    print("C_v topic coherence for time slice {} is {}\n".format(slice_idx, cm_cv.get_coherence()))
U_mass topic coherence for time slice 0 is -1.5541729439538832
C_v topic coherence for time slice 0 is 0.5107585530532012

U_mass topic coherence for time slice 1 is -1.5637246263545326
C_v topic coherence for time slice 1 is 0.51689783972042

U_mass topic coherence for time slice 2 is -1.5503575266253393
C_v topic coherence for time slice 2 is 0.5130503285375594

U_mass topic coherence for time slice 3 is -1.556059172869139
C_v topic coherence for time slice 3 is 0.5000063761521739

U_mass topic coherence for time slice 4 is -1.574721515162128
C_v topic coherence for time slice 4 is 0.4947732734552862

U_mass topic coherence for time slice 5 is -1.5449613886087166
C_v topic coherence for time slice 5 is 0.4977619376575218

U_mass topic coherence for time slice 6 is -1.5703996916354193
C_v topic coherence for time slice 6 is 0.5144505701987887

U_mass topic coherence for time slice 7 is -1.6126666979742343
C_v topic coherence for time slice 7 is 0.510858620949502

In [ ]: